library(tidyverse)
library(edgeR)
library(AnnotationHub)
library(magrittr)
library(scales)
library(GenomicRanges)
library(cqn)
library(fgsea)
library(kableExtra)
if (interactive()) setwd(here::here("R"))
theme_set(theme_bw())
nCores <- min(12, detectCores() - 1)
# # Get GC content and lengths
# gcTrans <- url("https://uofabioinformaticshub.github.io/Ensembl_GC/Release94/Danio_rerio.GRCz11.94.rds") %>%
# readRDS()
# # Convert to the gene-level with average length, average GC, max length, and GC of longest transcript for all transcripts assigned to a single gene.
# gcGene <- gcTrans %>%
# split(f = .$gene_id) %>%
# mclapply(function(x){
# gr <- reduce(x)
# df <- DataFrame(
# gene_id = unique(x$gene_id),
# gene_symbol = unique(x$gene_symbol),
# aveLength = round(mean(x$length),0) %>% as.integer(),
# aveGc = sum(x$length * x$gc) / sum(x$length),
# maxLength = max(x$length),
# longestGc = x$gc[which.max(x$length)[[1]]]
# )
# mcols(gr) <- df
# gr
# }, mc.cores = 4) %>%
# GRangesList() %>%
# unlist() %>%
# na.omit()
# # This takes a reasonable amount of time
# # Save as .rds file for faster loading
# saveRDS("../files/gcGene.rds")
gcGene <- readRDS("../files/gcGene.rds")
# Load id conversion file
idConvert <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
dplyr::select(Geneid = zfID, EntrezID = Entrez) %>%
mutate(EntrezID = as.character(EntrezID))
# Create function to convert ids
convertHsEG2Dr <- function(ids, df = idConvert){
dplyr::filter(df, EntrezID %in% ids)$Geneid
}
# Conversion of zebrafish ensembl ID to zebrafish symbol, for plotting on network analyses
idConvertSymbol <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
dplyr::select(label = zfID, symbol = zfName) %>%
na.omit() %>%
unique()
# Import hallmark human gene genesets and tidy the gene set names
# .gmt file downloaded from:
# http://software.broadinstitute.org/gsea/downloads.jsp
hallmark <- gmtPathways("../files/h.all.v6.2.entrez.gmt") %>%
mclapply(convertHsEG2Dr, mc.cores = 4) %>%
set_names(str_remove_all(names(.), "HALLMARK_"))
# Load counts analysed by feature counts
counts_19 <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
dplyr::select(Geneid, starts_with("W"), starts_with("Q"))
# Create DGEList and calculate normalisaton factors
dgeList_19 <- counts_19 %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
# Set group variable
dgeList_19$samples$group <- colnames(dgeList_19) %>%
str_extract("(W|Q)") %>%
factor(levels = c("W", "Q"))
# Add AnnotationHub and subset to search for zebrafish
ah <- AnnotationHub()
ah %>%
subset(species == "Danio rerio") %>%
subset(dataprovider == "Ensembl") %>%
subset(rdataclass == "EnsDb")
## AnnotationHub with 12 records
## # snapshotDate(): 2019-10-29
## # $dataprovider: Ensembl
## # $species: Danio rerio
## # $rdataclass: EnsDb
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## # rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH53201"]]'
##
## title
## AH53201 | Ensembl 87 EnsDb for Danio Rerio
## AH53705 | Ensembl 88 EnsDb for Danio Rerio
## AH56671 | Ensembl 89 EnsDb for Danio Rerio
## AH57746 | Ensembl 90 EnsDb for Danio Rerio
## AH60762 | Ensembl 91 EnsDb for Danio Rerio
## ... ...
## AH64906 | Ensembl 94 EnsDb for Danio rerio
## AH67932 | Ensembl 95 EnsDb for Danio rerio
## AH69169 | Ensembl 96 EnsDb for Danio rerio
## AH73861 | Ensembl 97 EnsDb for Danio rerio
## AH74989 | Ensembl 98 EnsDb for Danio rerio
# Select correct Ensembl release
ensDb <- ah[["AH64906"]]
# Extract GenomicRanges object from ensDb
genesGR <- genes(ensDb)
# Remove redundant columns from mcols
mcols(genesGR) <- mcols(genesGR)[c("gene_id", "gene_name",
"gene_biotype", "entrezid")] %>%
as.data.frame() %>%
dplyr::select(-entrezid) %>%
left_join(as.data.frame(mcols(gcGene))) %>%
distinct(gene_id, .keep_all = TRUE) %>%
set_rownames(.$gene_id) %>%
DataFrame() %>%
.[names(genesGR),]
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeList_19$genes <- genesGR[rownames(dgeList_19),]
# Create logical vector of genes to keep that fit criteria
genes2keep_19 <- dgeList_19 %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFilt_19 <- dgeList_19[genes2keep_19,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
# Create model matrix
design_19 <- model.matrix(~group, data = dgeFilt_19$samples)
# Perform DE test
topTable_19 <- estimateGLMCommonDisp(dgeFilt_19) %>%
glmFit(design_19) %>%
glmLRT(coef=2) %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", ID.start, ID.end, sep = "-") %>%
unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
set_colnames(str_remove(colnames(.), "ID.")) %>%
dplyr::select(
Geneid = gene_id,
Symbol = gene_name,
AveExpr = logCPM,
logFC,
P.Value = PValue,
FDR,
Location,
aveLength,
aveGc,
maxLength,
longestGc
) %>%
mutate(
DE = FDR < 0.05,
RankStat = -sign(logFC)*log10(P.Value)
)
if (!interactive()) {
topTable_19 %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
dplyr::slice(1:20) %>%
mutate(
P.Value = sprintf("%.2e", P.Value),
FDR = sprintf("%.2e", FDR)
) %>%
kable(caption = "The 20 most differentially expressed genes") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000091368 | AL954327.1 | 2.656650 | -5.906385 | 1.45e-98 | 2.67e-94 |
| ENSDARG00000111860 | exorh | 3.993233 | 3.405441 | 1.63e-74 | 1.49e-70 |
| ENSDARG00000009637 | rcvrn3 | 2.423242 | 3.635172 | 4.41e-68 | 2.70e-64 |
| ENSDARG00000111827 | rcvrnb | 3.200713 | 3.212106 | 3.16e-63 | 1.45e-59 |
| ENSDARG00000059163 | rbp3 | 2.234735 | 3.296390 | 3.12e-57 | 1.14e-53 |
| ENSDARG00000098249 | asmt | 2.165764 | 3.257285 | 9.65e-56 | 2.95e-52 |
| ENSDARG00000002768 | pvalb2 | 2.238580 | -3.378321 | 4.41e-50 | 1.16e-46 |
| ENSDARG00000093753 | BX004774.2 | 2.161002 | 2.809252 | 2.75e-44 | 6.32e-41 |
| ENSDARG00000067990 | myhz1.1 | 2.622203 | -2.876345 | 9.63e-43 | 1.96e-39 |
| ENSDARG00000053254 | mylpfa | 2.393426 | -2.878776 | 3.33e-41 | 6.11e-38 |
| ENSDARG00000040565 | ckmb | 2.632848 | -2.787234 | 8.11e-41 | 1.35e-37 |
| ENSDARG00000024877 | ptgr1 | 4.624935 | 2.330074 | 1.38e-40 | 2.11e-37 |
| ENSDARG00000035327 | ckma | 2.350054 | -2.799236 | 2.80e-39 | 3.95e-36 |
| ENSDARG00000075963 | mhc1uba | 5.918828 | 2.189626 | 1.24e-37 | 1.62e-34 |
| ENSDARG00000099197 | actc1b | 3.642893 | -2.441845 | 3.51e-37 | 4.30e-34 |
| ENSDARG00000017624 | krt4 | 3.777492 | -2.393940 | 2.15e-36 | 2.47e-33 |
| ENSDARG00000017246 | prx | 2.325670 | -2.619766 | 2.00e-35 | 2.16e-32 |
| ENSDARG00000024789 | mxc | 1.817686 | 2.471397 | 4.40e-34 | 4.48e-31 |
| ENSDARG00000100397 | pde6c | 1.332726 | 2.436494 | 7.25e-30 | 7.01e-27 |
| ENSDARG00000035438 | myhc4 | 1.653235 | -2.433943 | 8.48e-28 | 7.78e-25 |
topTable_19 %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveGc, logFC)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "GC Content",
title = "2019 data without cqn"
) +
scale_color_manual(values = c("grey70", "red"))
Comparison of logFC and GC content before using cqn
topTable_19 %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveGc, RankStat)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "GC Content",
y = "Ranking Statistic",
title = "2019 data without cqn"
) +
coord_cartesian(ylim = c(-10, 10)) +
scale_color_manual(values = c("grey70", "red"))
Comparison of ranking statistics and GC content before using cqn
topTable_19 %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveLength, logFC)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "Gene length",
title = "2019 data without cqn"
) +
scale_color_manual(values = c("grey70", "red")) +
scale_x_log10(labels = comma)
topTable_19 %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveLength, RankStat)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "Gene Length",
y = "Ranking Statistic",
title = "2019 data without cqn"
) +
coord_cartesian(ylim = c(-10, 10)) +
scale_color_manual(values = c("grey70", "red")) +
scale_x_log10(labels = comma)
# Create named vector of gene level statistics
ranks_19 <- topTable_19 %>%
dplyr::arrange(RankStat) %>%
with(structure(RankStat, names = Geneid))
# Run GSEA
fgsea_19 <- fgsea(hallmark, ranks_19, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(bonferroni = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
if (!interactive()) {
fgsea_19 %>%
dplyr::select(-leadingEdge) %>%
dplyr::slice(1:10) %>%
kable(caption = "The 10 most significantly enriched pathways") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
| pathway | pval | FDR | ES | NES | nMoreExtreme | size | bonferroni |
|---|---|---|---|---|---|---|---|
| INTERFERON_GAMMA_RESPONSE | 0.0000197 | 0.0004921 | 0.7628697 | 2.247079 | 0 | 200 | 0.0009842 |
| ALLOGRAFT_REJECTION | 0.0000197 | 0.0004921 | 0.7663245 | 2.253349 | 0 | 195 | 0.0009842 |
| MYOGENESIS | 0.0001425 | 0.0023756 | -0.7025634 | -2.153505 | 6 | 216 | 0.0071267 |
| INTERFERON_ALPHA_RESPONSE | 0.0045548 | 0.0505899 | 0.7494870 | 2.028404 | 225 | 83 | 0.2277399 |
| INFLAMMATORY_RESPONSE | 0.0050590 | 0.0505899 | 0.6412325 | 1.863712 | 255 | 173 | 0.2529494 |
| OXIDATIVE_PHOSPHORYLATION | 0.0201470 | 0.1678919 | 0.5569524 | 1.652094 | 1024 | 219 | 1.0000000 |
| KRAS_SIGNALING_DN | 0.0246021 | 0.1698544 | -0.5742175 | -1.702190 | 1220 | 155 | 1.0000000 |
| COMPLEMENT | 0.0271767 | 0.1698544 | 0.5554376 | 1.636422 | 1378 | 202 | 1.0000000 |
| ESTROGEN_RESPONSE_EARLY | 0.0381706 | 0.2030876 | -0.5167204 | -1.566930 | 1876 | 196 | 1.0000000 |
| MYC_TARGETS_V1 | 0.0406175 | 0.2030876 | 0.5272316 | 1.564737 | 2067 | 220 | 1.0000000 |
# Check which genes in dgeFilt have GC content and length in gcGene
genes2keep_19cqn <- rownames(dgeFilt_19) %in% mcols(gcGene)$gene_id
dgeFilt_19cqn <- dgeFilt_19[genes2keep_19cqn,, keep.lib.sizes = FALSE]
cqn_19 <- cqn(
dgeFilt_19cqn$counts,
x = mcols(dgeFilt_19cqn$genes)$aveGc,
lengths = mcols(dgeFilt_19cqn$genes)$aveLength,
sizeFactors = dgeFilt_19cqn$samples$lib.size
)
dgeFilt_19cqn$offset <- cqn_19$glm.offset
design_19cqn <- model.matrix(~ group, data = dgeFilt_19cqn$samples)
topTable_19cqn <- estimateGLMCommonDisp(dgeFilt_19cqn) %>%
glmFit(design_19cqn) %>%
glmLRT(coef=2) %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", ID.start, ID.end, sep = "-") %>%
unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
set_colnames(str_remove(colnames(.), "ID.")) %>%
dplyr::select(
Geneid = gene_id,
Symbol = gene_name,
AveExpr = logCPM,
logFC,
P.Value = PValue,
FDR,
Location,
aveLength,
aveGc,
maxLength,
longestGc
) %>%
mutate(
DE = FDR < 0.05,
RankStat = -sign(logFC)*log10(P.Value)
)
if (!interactive()) {
topTable_19cqn %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
dplyr::slice(1:20) %>%
mutate(
P.Value = sprintf("%.2e", P.Value),
FDR = sprintf("%.2e", FDR)
) %>%
kable(caption = "The 20 most differentially expressed genes") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000111860 | exorh | 4.022706 | 3.439359 | 2.75e-92 | 4.84e-88 |
| ENSDARG00000009637 | rcvrn3 | 2.455893 | 3.560549 | 9.24e-77 | 8.13e-73 |
| ENSDARG00000111827 | rcvrnb | 3.231423 | 3.144133 | 6.92e-73 | 4.06e-69 |
| ENSDARG00000098249 | asmt | 2.198911 | 3.333733 | 4.53e-68 | 1.99e-64 |
| ENSDARG00000059163 | rbp3 | 2.267500 | 3.302384 | 3.87e-67 | 1.36e-63 |
| ENSDARG00000002768 | pvalb2 | 2.263883 | -3.395165 | 1.17e-58 | 3.42e-55 |
| ENSDARG00000024877 | ptgr1 | 4.651772 | 2.231490 | 5.25e-46 | 1.32e-42 |
| ENSDARG00000053254 | mylpfa | 2.418812 | -2.793595 | 1.32e-45 | 2.90e-42 |
| ENSDARG00000075963 | mhc1uba | 5.946181 | 2.150825 | 4.69e-45 | 9.17e-42 |
| ENSDARG00000040565 | ckmb | 2.658295 | -2.676428 | 1.39e-44 | 2.45e-41 |
| ENSDARG00000035327 | ckma | 2.375427 | -2.680239 | 2.80e-42 | 4.48e-39 |
| ENSDARG00000067990 | myhz1.1 | 2.647735 | -2.604647 | 7.86e-42 | 1.15e-38 |
| ENSDARG00000099197 | actc1b | 3.668377 | -2.325127 | 4.04e-41 | 5.47e-38 |
| ENSDARG00000017624 | krt4 | 3.802966 | -2.289225 | 1.11e-40 | 1.39e-37 |
| ENSDARG00000024789 | mxc | 1.847148 | 2.479242 | 8.69e-40 | 1.02e-36 |
| ENSDARG00000017246 | prx | 2.350905 | -2.402629 | 3.08e-35 | 3.38e-32 |
| ENSDARG00000100397 | pde6c | 1.366511 | 2.472544 | 5.25e-34 | 5.43e-31 |
| ENSDARG00000074345 | si:ch211-214b16.4 | 3.466161 | 1.918083 | 1.45e-32 | 1.42e-29 |
| ENSDARG00000068457 | tnnt3b | 1.919037 | -2.226803 | 2.72e-29 | 2.52e-26 |
| ENSDARG00000010729 | CABZ01073795.1 | 3.744796 | 1.772594 | 4.87e-29 | 4.29e-26 |
# Check similarity of genes
topTable_19cqn$Geneid[topTable_19cqn$DE == TRUE] %in%
topTable_19$Geneid[topTable_19$DE == TRUE] %>%
table()
## .
## FALSE TRUE
## 54 271
topTable_19cqn %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveGc, logFC)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "GC Content",
title = "2019 data with cqn"
) +
scale_color_manual(values = c("grey70", "red"))
topTable_19cqn %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveGc, RankStat)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "GC Content",
y = "Ranking Statistic",
title = "2019 data with cqn"
) +
coord_cartesian(ylim = c(-10, 10)) +
scale_color_manual(values = c("grey70", "red"))
topTable_19cqn %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveLength)) %>%
ggplot(aes(aveLength, logFC)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "Gene length",
title = "2019 data with cqn"
) +
scale_color_manual(values = c("grey70", "red")) +
scale_x_log10(labels = comma)
topTable_19cqn %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveLength)) %>%
ggplot(aes(aveLength, RankStat)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "Gene Length",
y = "Ranking Statistic",
title = "2019 data with cqn"
) +
scale_color_manual(values = c("grey70", "red")) +
coord_cartesian(ylim = c(-10, 10)) +
scale_x_log10(labels = comma)
# Create named vector of gene level statistics
ranks_19cqn <- topTable_19cqn %>%
dplyr::arrange(RankStat) %>%
with(structure(RankStat, names = Geneid))
# Run GSEA
fgsea_19cqn <- fgsea(hallmark, ranks_19cqn, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
if (!interactive()) {
fgsea_19cqn %>%
dplyr::select(-leadingEdge) %>%
dplyr::slice(1:10) %>%
kable(caption = "The 10 most significantly enriched pathways") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
| pathway | pval | FDR | ES | NES | nMoreExtreme | size | padj |
|---|---|---|---|---|---|---|---|
| MYOGENESIS | 0.0000241 | 0.0008592 | -0.7136154 | -2.158272 | 0 | 216 | 0.0012046 |
| INTERFERON_GAMMA_RESPONSE | 0.0000344 | 0.0008592 | 0.7884000 | 2.219730 | 1 | 200 | 0.0017183 |
| ALLOGRAFT_REJECTION | 0.0001031 | 0.0017190 | 0.7611030 | 2.138609 | 5 | 195 | 0.0051571 |
| INTERFERON_ALPHA_RESPONSE | 0.0015861 | 0.0198259 | 0.8062983 | 2.099017 | 87 | 83 | 0.0793036 |
| INFLAMMATORY_RESPONSE | 0.0091171 | 0.0911707 | 0.6523203 | 1.813266 | 525 | 173 | 0.4558533 |
| ESTROGEN_RESPONSE_EARLY | 0.0181522 | 0.1512687 | -0.5376487 | -1.610285 | 758 | 196 | 0.9076125 |
| KRAS_SIGNALING_DN | 0.0262577 | 0.1875549 | -0.5551186 | -1.623564 | 1117 | 155 | 1.0000000 |
| COMPLEMENT | 0.0325202 | 0.2032512 | 0.5813006 | 1.637586 | 1892 | 202 | 1.0000000 |
| ESTROGEN_RESPONSE_LATE | 0.0517571 | 0.2589051 | -0.4915559 | -1.471478 | 2164 | 195 | 1.0000000 |
| HEME_METABOLISM | 0.0517810 | 0.2589051 | -0.4915485 | -1.471455 | 2165 | 195 | 1.0000000 |
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000091368 | AL954327.1 | 2.656650 | -5.906385 | 1.45e-98 | 2.67e-94 |
| ENSDARG00000111860 | exorh | 3.993233 | 3.405441 | 1.63e-74 | 1.49e-70 |
| ENSDARG00000009637 | rcvrn3 | 2.423242 | 3.635172 | 4.41e-68 | 2.70e-64 |
| ENSDARG00000111827 | rcvrnb | 3.200713 | 3.212106 | 3.16e-63 | 1.45e-59 |
| ENSDARG00000059163 | rbp3 | 2.234735 | 3.296390 | 3.12e-57 | 1.14e-53 |
| ENSDARG00000098249 | asmt | 2.165764 | 3.257285 | 9.65e-56 | 2.95e-52 |
| ENSDARG00000002768 | pvalb2 | 2.238580 | -3.378321 | 4.41e-50 | 1.16e-46 |
| ENSDARG00000093753 | BX004774.2 | 2.161002 | 2.809252 | 2.75e-44 | 6.32e-41 |
| ENSDARG00000067990 | myhz1.1 | 2.622203 | -2.876345 | 9.63e-43 | 1.96e-39 |
| ENSDARG00000053254 | mylpfa | 2.393426 | -2.878776 | 3.33e-41 | 6.11e-38 |
| ENSDARG00000040565 | ckmb | 2.632848 | -2.787234 | 8.11e-41 | 1.35e-37 |
| ENSDARG00000024877 | ptgr1 | 4.624935 | 2.330074 | 1.38e-40 | 2.11e-37 |
| ENSDARG00000035327 | ckma | 2.350054 | -2.799236 | 2.80e-39 | 3.95e-36 |
| ENSDARG00000075963 | mhc1uba | 5.918828 | 2.189626 | 1.24e-37 | 1.62e-34 |
| ENSDARG00000099197 | actc1b | 3.642893 | -2.441845 | 3.51e-37 | 4.30e-34 |
| ENSDARG00000017624 | krt4 | 3.777492 | -2.393940 | 2.15e-36 | 2.47e-33 |
| ENSDARG00000017246 | prx | 2.325670 | -2.619766 | 2.00e-35 | 2.16e-32 |
| ENSDARG00000024789 | mxc | 1.817686 | 2.471397 | 4.40e-34 | 4.48e-31 |
| ENSDARG00000100397 | pde6c | 1.332726 | 2.436494 | 7.25e-30 | 7.01e-27 |
| ENSDARG00000035438 | myhc4 | 1.653235 | -2.433943 | 8.48e-28 | 7.78e-25 |
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000111860 | exorh | 4.022706 | 3.439359 | 2.75e-92 | 4.84e-88 |
| ENSDARG00000009637 | rcvrn3 | 2.455893 | 3.560549 | 9.24e-77 | 8.13e-73 |
| ENSDARG00000111827 | rcvrnb | 3.231423 | 3.144133 | 6.92e-73 | 4.06e-69 |
| ENSDARG00000098249 | asmt | 2.198911 | 3.333733 | 4.53e-68 | 1.99e-64 |
| ENSDARG00000059163 | rbp3 | 2.267500 | 3.302384 | 3.87e-67 | 1.36e-63 |
| ENSDARG00000002768 | pvalb2 | 2.263883 | -3.395165 | 1.17e-58 | 3.42e-55 |
| ENSDARG00000024877 | ptgr1 | 4.651772 | 2.231490 | 5.25e-46 | 1.32e-42 |
| ENSDARG00000053254 | mylpfa | 2.418812 | -2.793595 | 1.32e-45 | 2.90e-42 |
| ENSDARG00000075963 | mhc1uba | 5.946181 | 2.150825 | 4.69e-45 | 9.17e-42 |
| ENSDARG00000040565 | ckmb | 2.658295 | -2.676428 | 1.39e-44 | 2.45e-41 |
| ENSDARG00000035327 | ckma | 2.375427 | -2.680239 | 2.80e-42 | 4.48e-39 |
| ENSDARG00000067990 | myhz1.1 | 2.647735 | -2.604647 | 7.86e-42 | 1.15e-38 |
| ENSDARG00000099197 | actc1b | 3.668377 | -2.325127 | 4.04e-41 | 5.47e-38 |
| ENSDARG00000017624 | krt4 | 3.802966 | -2.289225 | 1.11e-40 | 1.39e-37 |
| ENSDARG00000024789 | mxc | 1.847148 | 2.479242 | 8.69e-40 | 1.02e-36 |
| ENSDARG00000017246 | prx | 2.350905 | -2.402629 | 3.08e-35 | 3.38e-32 |
| ENSDARG00000100397 | pde6c | 1.366511 | 2.472544 | 5.25e-34 | 5.43e-31 |
| ENSDARG00000074345 | si:ch211-214b16.4 | 3.466161 | 1.918083 | 1.45e-32 | 1.42e-29 |
| ENSDARG00000068457 | tnnt3b | 1.919037 | -2.226803 | 2.72e-29 | 2.52e-26 |
| ENSDARG00000010729 | CABZ01073795.1 | 3.744796 | 1.772594 | 4.87e-29 | 4.29e-26 |
| pathway | pval | FDR | ES | NES | nMoreExtreme | size | bonferroni |
|---|---|---|---|---|---|---|---|
| INTERFERON_GAMMA_RESPONSE | 0.0000197 | 0.0004921 | 0.7628697 | 2.247079 | 0 | 200 | 0.0009842 |
| ALLOGRAFT_REJECTION | 0.0000197 | 0.0004921 | 0.7663245 | 2.253349 | 0 | 195 | 0.0009842 |
| MYOGENESIS | 0.0001425 | 0.0023756 | -0.7025634 | -2.153505 | 6 | 216 | 0.0071267 |
| INTERFERON_ALPHA_RESPONSE | 0.0045548 | 0.0505899 | 0.7494870 | 2.028404 | 225 | 83 | 0.2277399 |
| INFLAMMATORY_RESPONSE | 0.0050590 | 0.0505899 | 0.6412325 | 1.863712 | 255 | 173 | 0.2529494 |
| OXIDATIVE_PHOSPHORYLATION | 0.0201470 | 0.1678919 | 0.5569524 | 1.652094 | 1024 | 219 | 1.0000000 |
| KRAS_SIGNALING_DN | 0.0246021 | 0.1698544 | -0.5742175 | -1.702190 | 1220 | 155 | 1.0000000 |
| COMPLEMENT | 0.0271767 | 0.1698544 | 0.5554376 | 1.636422 | 1378 | 202 | 1.0000000 |
| ESTROGEN_RESPONSE_EARLY | 0.0381706 | 0.2030876 | -0.5167204 | -1.566930 | 1876 | 196 | 1.0000000 |
| MYC_TARGETS_V1 | 0.0406175 | 0.2030876 | 0.5272316 | 1.564737 | 2067 | 220 | 1.0000000 |
| pathway | pval | FDR | ES | NES | nMoreExtreme | size | padj |
|---|---|---|---|---|---|---|---|
| MYOGENESIS | 0.0000241 | 0.0008592 | -0.7136154 | -2.158272 | 0 | 216 | 0.0012046 |
| INTERFERON_GAMMA_RESPONSE | 0.0000344 | 0.0008592 | 0.7884000 | 2.219730 | 1 | 200 | 0.0017183 |
| ALLOGRAFT_REJECTION | 0.0001031 | 0.0017190 | 0.7611030 | 2.138609 | 5 | 195 | 0.0051571 |
| INTERFERON_ALPHA_RESPONSE | 0.0015861 | 0.0198259 | 0.8062983 | 2.099017 | 87 | 83 | 0.0793036 |
| INFLAMMATORY_RESPONSE | 0.0091171 | 0.0911707 | 0.6523203 | 1.813266 | 525 | 173 | 0.4558533 |
| ESTROGEN_RESPONSE_EARLY | 0.0181522 | 0.1512687 | -0.5376487 | -1.610285 | 758 | 196 | 0.9076125 |
| KRAS_SIGNALING_DN | 0.0262577 | 0.1875549 | -0.5551186 | -1.623564 | 1117 | 155 | 1.0000000 |
| COMPLEMENT | 0.0325202 | 0.2032512 | 0.5813006 | 1.637586 | 1892 | 202 | 1.0000000 |
| ESTROGEN_RESPONSE_LATE | 0.0517571 | 0.2589051 | -0.4915559 | -1.471478 | 2164 | 195 | 1.0000000 |
| HEME_METABOLISM | 0.0517810 | 0.2589051 | -0.4915485 | -1.471455 | 2165 | 195 | 1.0000000 |
# Create object for converting transcripts to genes
transGR <- transcripts(ensDb)
mcols(transGR) <- mcols(transGR)[c("tx_id", "gene_id")]
trans2Gene <- tibble(
tx_id = transGR$tx_id,
gene_id = transGR$gene_id
)
# Load counts from 2017 data obtained as a .rds from Nhi
counts_17 <- read_rds("nhiData/6month_Normoxia.rds")
# Create DGEList
dgeList_17 <- counts_17$counts %>%
as.data.frame() %>%
rownames_to_column("tx_id") %>%
as_tibble() %>%
set_colnames(basename(colnames(.))) %>%
mutate(tx_id = str_remove_all(tx_id, "\\.[0-9]*$")) %>%
left_join(trans2Gene) %>%
group_by(gene_id) %>%
summarise_if(
.predicate = is.numeric,
.funs = sum
) %>%
as.data.frame() %>%
column_to_rownames("gene_id") %>%
round() %>%
set_colnames(str_remove(colnames(.), "[0-9]_MORGAN_6")) %>%
set_colnames(str_remove(colnames(.), "PN")) %>%
set_colnames(str_remove(colnames(.), "_[RS].+")) %>%
set_colnames(str_replace(colnames(.), "P", "W")) %>%
DGEList(
group = colnames(.) %>%
str_replace_all("([WQ])(.+)", "\\1") %>%
str_replace_all("W", "WT") %>%
str_replace_all("Q", "Mut") %>%
factor(levels = c("WT", "Mut")),
genes = genesGR[rownames(.)]
) %>%
calcNormFactors()
# Create logical vector of genes to keep that fit criteria
genes2keep_17 <- dgeList_17 %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFilt_17 <- dgeList_17[genes2keep_17,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
# Create model matrix
design_17 <- model.matrix(~group, data = dgeFilt_17$samples)
# Perform DE test
topTable_17 <- estimateGLMCommonDisp(dgeFilt_17) %>%
glmFit(design_17) %>%
glmLRT(coef=2) %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", start, end, sep = "-") %>%
unite("Location", seqnames, Range, strand, sep = ":") %>%
set_colnames(str_remove(colnames(.), "ID.")) %>%
dplyr::select(
Geneid = gene_id,
Symbol = gene_name,
AveExpr = logCPM,
logFC,
P.Value = PValue,
FDR,
Location,
aveLength,
aveGc,
maxLength,
longestGc
) %>%
mutate(
DE = FDR < 0.05,
RankStat = -sign(logFC)*log10(P.Value)
)
if (!interactive()) {
topTable_17 %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
dplyr::slice(1:20) %>%
mutate(
P.Value = sprintf("%.2e", P.Value),
FDR = sprintf("%.2e", FDR)
) %>%
kable(caption = "The 20 most differentially expressed genes") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000080675 | si:dkey-71b5.7 | 3.0881824 | -3.097472 | 2.28e-63 | 4.22e-59 |
| ENSDARG00000058371 | krt5 | 4.6741362 | 2.449728 | 8.45e-59 | 7.83e-55 |
| ENSDARG00000112702 | BX537263.3 | 7.3996536 | -2.243755 | 1.88e-56 | 1.16e-52 |
| ENSDARG00000061585 | cyp4v7 | 2.3191654 | -3.139064 | 5.07e-52 | 2.35e-48 |
| ENSDARG00000087290 | si:ch211-202h22.10 | 0.6296277 | 5.591881 | 2.94e-50 | 1.09e-46 |
| ENSDARG00000036830 | krt91 | 3.9384441 | 2.120057 | 1.22e-42 | 3.76e-39 |
| ENSDARG00000074653 | si:ch211-233m11.1 | 2.6910751 | -2.277264 | 6.31e-36 | 1.67e-32 |
| ENSDARG00000093531 | slc37a4b | 0.4139975 | 3.306116 | 9.71e-28 | 2.25e-24 |
| ENSDARG00000031952 | mb | 2.5216783 | 1.889320 | 4.65e-27 | 9.57e-24 |
| ENSDARG00000036773 | s100s | 2.1504571 | 1.812137 | 6.43e-23 | 1.19e-19 |
| ENSDARG00000053480 | aqp9b | 1.7942745 | 1.755679 | 2.08e-19 | 3.51e-16 |
| ENSDARG00000096564 | CU915827.1 | 1.2875096 | 1.737070 | 2.47e-16 | 3.82e-13 |
| ENSDARG00000045367 | tuba1b | 6.6404569 | -1.126187 | 2.79e-16 | 3.98e-13 |
| ENSDARG00000091734 | traf3ip2l | 3.0840975 | -1.308960 | 7.46e-16 | 9.88e-13 |
| ENSDARG00000069093 | col2a1a | 2.6124607 | 1.350510 | 1.10e-15 | 1.36e-12 |
| ENSDARG00000020581 | otofb | 2.0502321 | 1.453107 | 2.21e-15 | 2.57e-12 |
| ENSDARG00000095578 | si:dkey-222h21.7 | 0.4110680 | 2.109993 | 4.60e-15 | 5.02e-12 |
| ENSDARG00000092604 | si:ch211-15j1.4 | 4.7446966 | 1.067221 | 7.73e-14 | 7.96e-11 |
| ENSDARG00000101598 | si:ch211-215p11.1 | 2.4986432 | 1.257070 | 1.49e-13 | 1.45e-10 |
| ENSDARG00000096273 | si:dkey-3n22.9 | 1.0162751 | 1.601951 | 7.98e-13 | 7.29e-10 |
topTable_17 %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveGc, logFC)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "GC Content",
title = "2017 data without cqn"
) +
scale_color_manual(values = c("grey70", "red"))
Comparison of logFC and GC content before using cqn
topTable_17 %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveGc, RankStat)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "GC Content",
y = "Ranking Statistic",
title = "2017 data without cqn"
) +
coord_cartesian(ylim = c(-10, 10)) +
scale_color_manual(values = c("grey70", "red"))
Comparison of ranking statistics and GC content before using cqn
topTable_17 %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveLength)) %>%
ggplot(aes(aveLength, logFC)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "Gene length",
title = "2017 data without cqn"
) +
scale_color_manual(values = c("grey70", "red")) +
scale_x_log10(labels = comma)
Comparison of logFC and GC content before using cqn
topTable_17 %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveLength)) %>%
ggplot(aes(aveLength, RankStat)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "Gene Length",
y = "Ranking Statistic",
title = "2017 data without cqn"
) +
coord_cartesian(ylim = c(-10, 10)) +
scale_color_manual(values = c("grey70", "red")) +
scale_x_log10(labels = comma)
Comparison of ranking statistics and GC content before using cqn
# Create named vector of gene level statistics
ranks_17 <- topTable_17 %>%
dplyr::arrange(RankStat) %>%
with(structure(RankStat, names = Geneid))
# Run GSEA
fgsea_17 <- fgsea(hallmark, ranks_17, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(bonferroni = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
if (!interactive()) {
fgsea_17 %>%
dplyr::select(-leadingEdge) %>%
dplyr::slice(1:10) %>%
kable(caption = "The 10 most significantly enriched pathways") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
| pathway | pval | FDR | ES | NES | nMoreExtreme | size | bonferroni |
|---|---|---|---|---|---|---|---|
| ANGIOGENESIS | 0.0127794 | 0.3806018 | 0.7426339 | 1.888033 | 701 | 33 | 0.6389718 |
| MTORC1_SIGNALING | 0.0295921 | 0.3806018 | -0.4814323 | -1.641559 | 1051 | 229 | 1.0000000 |
| MYOGENESIS | 0.0372656 | 0.3806018 | 0.4628162 | 1.504488 | 2396 | 225 | 1.0000000 |
| CHOLESTEROL_HOMEOSTASIS | 0.0385437 | 0.3806018 | -0.5582494 | -1.711517 | 1605 | 81 | 1.0000000 |
| XENOBIOTIC_METABOLISM | 0.0452206 | 0.3806018 | -0.4664074 | -1.587038 | 1616 | 223 | 1.0000000 |
| EPITHELIAL_MESENCHYMAL_TRANSITION | 0.0456722 | 0.3806018 | 0.4525985 | 1.467334 | 2927 | 219 | 1.0000000 |
| COMPLEMENT | 0.0633502 | 0.4525013 | 0.4321599 | 1.396831 | 4046 | 213 | 1.0000000 |
| P53_PATHWAY | 0.0756213 | 0.4726333 | 0.4187876 | 1.356431 | 4843 | 217 | 1.0000000 |
| OXIDATIVE_PHOSPHORYLATION | 0.0900322 | 0.5001786 | -0.4158784 | -1.414168 | 3219 | 222 | 1.0000000 |
| MYC_TARGETS_V1 | 0.1318700 | 0.5696833 | -0.3686211 | -1.253166 | 4721 | 221 | 1.0000000 |
# Check which genes in dgeFilt have GC content and length in gcGene
genes2keep_17cqn <- rownames(dgeFilt_17) %in% mcols(gcGene)$gene_id
dgeFilt_17cqn <- dgeFilt_17[genes2keep_17cqn,, keep.lib.sizes = FALSE]
cqn_17 <- cqn(
dgeFilt_17cqn$counts,
x = dgeFilt_17cqn$genes$aveGc,
lengths = dgeFilt_17cqn$genes$aveLength,
sizeFactors = dgeFilt_17cqn$samples$lib.size
)
dgeFilt_17cqn$offset <- cqn_17$glm.offset
design_17cqn <- model.matrix(~ group, data = dgeFilt_17cqn$samples)
topTable_17cqn <- estimateGLMCommonDisp(dgeFilt_17cqn) %>%
glmFit(design_17cqn) %>%
glmLRT(coef=2) %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", start, end, sep = "-") %>%
unite("Location", seqnames, Range, strand, sep = ":") %>%
set_colnames(str_remove(colnames(.), "ID.")) %>%
dplyr::select(
Geneid = gene_id,
Symbol = gene_name,
AveExpr = logCPM,
logFC,
P.Value = PValue,
FDR,
Location,
aveLength,
aveGc,
maxLength,
longestGc
) %>%
mutate(
DE = FDR < 0.05,
RankStat = -sign(logFC)*log10(P.Value)
)
if (!interactive()) {
topTable_17cqn %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
dplyr::slice(1:20) %>%
mutate(
P.Value = sprintf("%.2e", P.Value),
FDR = sprintf("%.2e", FDR)
) %>%
kable(caption = "The 20 most differentially expressed genes") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000080675 | si:dkey-71b5.7 | 3.0884154 | -3.160407 | 2.50e-69 | 4.63e-65 |
| ENSDARG00000058371 | krt5 | 4.6721904 | 2.476419 | 2.95e-64 | 2.73e-60 |
| ENSDARG00000061585 | cyp4v7 | 2.3274742 | -3.192150 | 3.79e-56 | 2.34e-52 |
| ENSDARG00000036830 | krt91 | 3.9359598 | 2.139469 | 3.25e-46 | 1.51e-42 |
| ENSDARG00000112702 | BX537263.3 | 7.3995271 | -1.881984 | 2.12e-44 | 7.85e-41 |
| ENSDARG00000087290 | si:ch211-202h22.10 | 0.6256874 | 5.176158 | 5.82e-41 | 1.80e-37 |
| ENSDARG00000031952 | mb | 2.5176837 | 1.905665 | 7.35e-29 | 1.95e-25 |
| ENSDARG00000036773 | s100s | 2.1466136 | 1.815182 | 5.26e-24 | 1.22e-20 |
| ENSDARG00000093531 | slc37a4b | 0.4111232 | 2.925726 | 1.21e-21 | 2.49e-18 |
| ENSDARG00000053480 | aqp9b | 1.7906742 | 1.768972 | 1.93e-20 | 3.58e-17 |
| ENSDARG00000074653 | si:ch211-233m11.1 | 2.6923190 | -1.673444 | 2.18e-20 | 3.68e-17 |
| ENSDARG00000091734 | traf3ip2l | 3.0851222 | -1.358823 | 7.11e-18 | 1.10e-14 |
| ENSDARG00000069093 | col2a1a | 2.6097279 | 1.381891 | 3.67e-17 | 5.24e-14 |
| ENSDARG00000045367 | tuba1b | 6.6405206 | -1.094861 | 1.21e-16 | 1.60e-13 |
| ENSDARG00000096564 | CU915827.1 | 1.2862540 | 1.716131 | 1.68e-16 | 2.08e-13 |
| ENSDARG00000020581 | otofb | 2.0469346 | 1.440647 | 8.49e-16 | 9.83e-13 |
| ENSDARG00000092604 | si:ch211-15j1.4 | 4.7443741 | 1.064560 | 1.02e-14 | 1.12e-11 |
| ENSDARG00000086819 | scn1laa | 2.0374641 | 1.280426 | 6.88e-13 | 7.08e-10 |
| ENSDARG00000103494 | usp43a | 3.3189310 | 1.078191 | 7.42e-13 | 7.23e-10 |
| ENSDARG00000105675 | BX005417.3 | 0.3569806 | 1.876728 | 1.64e-12 | 1.52e-09 |
# Check similarity of genes
topTable_17cqn$Geneid[topTable_17cqn$DE == TRUE] %in%
topTable_17$Geneid[topTable_17$DE == TRUE] %>%
table()
## .
## FALSE TRUE
## 29 188
topTable_17cqn %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveGc, logFC)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "GC Content",
title = "2017 data with cqn"
) +
scale_color_manual(values = c("grey70", "red"))
Comparison of logFC and GC content after using cqn
topTable_17cqn %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveGc)) %>%
ggplot(aes(aveGc, RankStat)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "GC Content",
y = "Ranking Statistic",
title = "2017 data with cqn"
) +
coord_cartesian(ylim = c(-10, 10)) +
scale_color_manual(values = c("grey70", "red"))
Comparison of ranking statistics and GC content after using cqn
topTable_17cqn %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveLength)) %>%
ggplot(aes(aveLength, logFC)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "Gene length",
title = "2017 data with cqn"
) +
scale_color_manual(values = c("grey70", "red")) +
scale_x_log10(labels = comma)
Comparison of logFC and GC content after using cqn
topTable_17cqn %>%
dplyr::arrange(desc(P.Value)) %>%
dplyr::filter(!is.na(aveLength)) %>%
ggplot(aes(aveLength, RankStat)) +
geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
geom_smooth(se = FALSE) +
labs(
x = "Gene Length",
y = "Ranking Statistic",
title = "2017 data with cqn"
) +
scale_color_manual(values = c("grey70", "red")) +
coord_cartesian(ylim = c(-10, 10)) +
scale_x_log10(labels = comma)
Comparison of ranking statistics and GC content after using cqn
# Create named vector of gene level statistics
ranks_17cqn <- topTable_17cqn %>%
dplyr::arrange(RankStat) %>%
with(structure(RankStat, names = Geneid))
# Run GSEA
fgsea_17cqn <- fgsea(hallmark, ranks_17cqn, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
if (!interactive()) {
fgsea_17cqn %>%
dplyr::select(-leadingEdge) %>%
dplyr::slice(1:10) %>%
kable(caption = "The 10 most significantly enriched pathways") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
| pathway | pval | FDR | ES | NES | nMoreExtreme | size | padj |
|---|---|---|---|---|---|---|---|
| ANGIOGENESIS | 0.0108945 | 0.2749676 | 0.7558817 | 1.927492 | 616 | 33 | 0.5447258 |
| MYOGENESIS | 0.0132930 | 0.2749676 | 0.5050005 | 1.653177 | 900 | 225 | 0.6646503 |
| EPITHELIAL_MESENCHYMAL_TRANSITION | 0.0203707 | 0.2749676 | 0.4901243 | 1.600737 | 1376 | 219 | 1.0000000 |
| XENOBIOTIC_METABOLISM | 0.0323571 | 0.2749676 | -0.4798916 | -1.669441 | 1042 | 223 | 1.0000000 |
| CHOLESTEROL_HOMEOSTASIS | 0.0329066 | 0.2749676 | -0.5533618 | -1.738633 | 1279 | 81 | 1.0000000 |
| MTORC1_SIGNALING | 0.0329961 | 0.2749676 | -0.4751843 | -1.657984 | 1059 | 229 | 1.0000000 |
| COMPLEMENT | 0.0465185 | 0.3322753 | 0.4454749 | 1.450777 | 3131 | 213 | 1.0000000 |
| P53_PATHWAY | 0.0569722 | 0.3560760 | 0.4313166 | 1.407191 | 3842 | 217 | 1.0000000 |
| INFLAMMATORY_RESPONSE | 0.0909277 | 0.4586587 | -0.3958120 | -1.346656 | 3111 | 172 | 1.0000000 |
| OXIDATIVE_PHOSPHORYLATION | 0.0917317 | 0.4586587 | -0.4016592 | -1.396654 | 2959 | 222 | 1.0000000 |
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000080675 | si:dkey-71b5.7 | 3.0881824 | -3.097472 | 2.28e-63 | 4.22e-59 |
| ENSDARG00000058371 | krt5 | 4.6741362 | 2.449728 | 8.45e-59 | 7.83e-55 |
| ENSDARG00000112702 | BX537263.3 | 7.3996536 | -2.243755 | 1.88e-56 | 1.16e-52 |
| ENSDARG00000061585 | cyp4v7 | 2.3191654 | -3.139064 | 5.07e-52 | 2.35e-48 |
| ENSDARG00000087290 | si:ch211-202h22.10 | 0.6296277 | 5.591881 | 2.94e-50 | 1.09e-46 |
| ENSDARG00000036830 | krt91 | 3.9384441 | 2.120057 | 1.22e-42 | 3.76e-39 |
| ENSDARG00000074653 | si:ch211-233m11.1 | 2.6910751 | -2.277264 | 6.31e-36 | 1.67e-32 |
| ENSDARG00000093531 | slc37a4b | 0.4139975 | 3.306116 | 9.71e-28 | 2.25e-24 |
| ENSDARG00000031952 | mb | 2.5216783 | 1.889320 | 4.65e-27 | 9.57e-24 |
| ENSDARG00000036773 | s100s | 2.1504571 | 1.812137 | 6.43e-23 | 1.19e-19 |
| ENSDARG00000053480 | aqp9b | 1.7942745 | 1.755679 | 2.08e-19 | 3.51e-16 |
| ENSDARG00000096564 | CU915827.1 | 1.2875096 | 1.737070 | 2.47e-16 | 3.82e-13 |
| ENSDARG00000045367 | tuba1b | 6.6404569 | -1.126187 | 2.79e-16 | 3.98e-13 |
| ENSDARG00000091734 | traf3ip2l | 3.0840975 | -1.308960 | 7.46e-16 | 9.88e-13 |
| ENSDARG00000069093 | col2a1a | 2.6124607 | 1.350510 | 1.10e-15 | 1.36e-12 |
| ENSDARG00000020581 | otofb | 2.0502321 | 1.453107 | 2.21e-15 | 2.57e-12 |
| ENSDARG00000095578 | si:dkey-222h21.7 | 0.4110680 | 2.109993 | 4.60e-15 | 5.02e-12 |
| ENSDARG00000092604 | si:ch211-15j1.4 | 4.7446966 | 1.067221 | 7.73e-14 | 7.96e-11 |
| ENSDARG00000101598 | si:ch211-215p11.1 | 2.4986432 | 1.257070 | 1.49e-13 | 1.45e-10 |
| ENSDARG00000096273 | si:dkey-3n22.9 | 1.0162751 | 1.601951 | 7.98e-13 | 7.29e-10 |
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000080675 | si:dkey-71b5.7 | 3.0884154 | -3.160407 | 2.50e-69 | 4.63e-65 |
| ENSDARG00000058371 | krt5 | 4.6721904 | 2.476419 | 2.95e-64 | 2.73e-60 |
| ENSDARG00000061585 | cyp4v7 | 2.3274742 | -3.192150 | 3.79e-56 | 2.34e-52 |
| ENSDARG00000036830 | krt91 | 3.9359598 | 2.139469 | 3.25e-46 | 1.51e-42 |
| ENSDARG00000112702 | BX537263.3 | 7.3995271 | -1.881984 | 2.12e-44 | 7.85e-41 |
| ENSDARG00000087290 | si:ch211-202h22.10 | 0.6256874 | 5.176158 | 5.82e-41 | 1.80e-37 |
| ENSDARG00000031952 | mb | 2.5176837 | 1.905665 | 7.35e-29 | 1.95e-25 |
| ENSDARG00000036773 | s100s | 2.1466136 | 1.815182 | 5.26e-24 | 1.22e-20 |
| ENSDARG00000093531 | slc37a4b | 0.4111232 | 2.925726 | 1.21e-21 | 2.49e-18 |
| ENSDARG00000053480 | aqp9b | 1.7906742 | 1.768972 | 1.93e-20 | 3.58e-17 |
| ENSDARG00000074653 | si:ch211-233m11.1 | 2.6923190 | -1.673444 | 2.18e-20 | 3.68e-17 |
| ENSDARG00000091734 | traf3ip2l | 3.0851222 | -1.358823 | 7.11e-18 | 1.10e-14 |
| ENSDARG00000069093 | col2a1a | 2.6097279 | 1.381891 | 3.67e-17 | 5.24e-14 |
| ENSDARG00000045367 | tuba1b | 6.6405206 | -1.094861 | 1.21e-16 | 1.60e-13 |
| ENSDARG00000096564 | CU915827.1 | 1.2862540 | 1.716131 | 1.68e-16 | 2.08e-13 |
| ENSDARG00000020581 | otofb | 2.0469346 | 1.440647 | 8.49e-16 | 9.83e-13 |
| ENSDARG00000092604 | si:ch211-15j1.4 | 4.7443741 | 1.064560 | 1.02e-14 | 1.12e-11 |
| ENSDARG00000086819 | scn1laa | 2.0374641 | 1.280426 | 6.88e-13 | 7.08e-10 |
| ENSDARG00000103494 | usp43a | 3.3189310 | 1.078191 | 7.42e-13 | 7.23e-10 |
| ENSDARG00000105675 | BX005417.3 | 0.3569806 | 1.876728 | 1.64e-12 | 1.52e-09 |
| pathway | pval | FDR | ES | NES | nMoreExtreme | size | bonferroni |
|---|---|---|---|---|---|---|---|
| ANGIOGENESIS | 0.0127794 | 0.3806018 | 0.7426339 | 1.888033 | 701 | 33 | 0.6389718 |
| MTORC1_SIGNALING | 0.0295921 | 0.3806018 | -0.4814323 | -1.641559 | 1051 | 229 | 1.0000000 |
| MYOGENESIS | 0.0372656 | 0.3806018 | 0.4628162 | 1.504488 | 2396 | 225 | 1.0000000 |
| CHOLESTEROL_HOMEOSTASIS | 0.0385437 | 0.3806018 | -0.5582494 | -1.711517 | 1605 | 81 | 1.0000000 |
| XENOBIOTIC_METABOLISM | 0.0452206 | 0.3806018 | -0.4664074 | -1.587038 | 1616 | 223 | 1.0000000 |
| EPITHELIAL_MESENCHYMAL_TRANSITION | 0.0456722 | 0.3806018 | 0.4525985 | 1.467334 | 2927 | 219 | 1.0000000 |
| COMPLEMENT | 0.0633502 | 0.4525013 | 0.4321599 | 1.396831 | 4046 | 213 | 1.0000000 |
| P53_PATHWAY | 0.0756213 | 0.4726333 | 0.4187876 | 1.356431 | 4843 | 217 | 1.0000000 |
| OXIDATIVE_PHOSPHORYLATION | 0.0900322 | 0.5001786 | -0.4158784 | -1.414168 | 3219 | 222 | 1.0000000 |
| MYC_TARGETS_V1 | 0.1318700 | 0.5696833 | -0.3686211 | -1.253166 | 4721 | 221 | 1.0000000 |
| pathway | pval | FDR | ES | NES | nMoreExtreme | size | padj |
|---|---|---|---|---|---|---|---|
| ANGIOGENESIS | 0.0108945 | 0.2749676 | 0.7558817 | 1.927492 | 616 | 33 | 0.5447258 |
| MYOGENESIS | 0.0132930 | 0.2749676 | 0.5050005 | 1.653177 | 900 | 225 | 0.6646503 |
| EPITHELIAL_MESENCHYMAL_TRANSITION | 0.0203707 | 0.2749676 | 0.4901243 | 1.600737 | 1376 | 219 | 1.0000000 |
| XENOBIOTIC_METABOLISM | 0.0323571 | 0.2749676 | -0.4798916 | -1.669441 | 1042 | 223 | 1.0000000 |
| CHOLESTEROL_HOMEOSTASIS | 0.0329066 | 0.2749676 | -0.5533618 | -1.738633 | 1279 | 81 | 1.0000000 |
| MTORC1_SIGNALING | 0.0329961 | 0.2749676 | -0.4751843 | -1.657984 | 1059 | 229 | 1.0000000 |
| COMPLEMENT | 0.0465185 | 0.3322753 | 0.4454749 | 1.450777 | 3131 | 213 | 1.0000000 |
| P53_PATHWAY | 0.0569722 | 0.3560760 | 0.4313166 | 1.407191 | 3842 | 217 | 1.0000000 |
| INFLAMMATORY_RESPONSE | 0.0909277 | 0.4586587 | -0.3958120 | -1.346656 | 3111 | 172 | 1.0000000 |
| OXIDATIVE_PHOSPHORYLATION | 0.0917317 | 0.4586587 | -0.4016592 | -1.396654 | 2959 | 222 | 1.0000000 |